In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
import time
import seaborn as sns

from statlearning import plot_regressions
from statlearning import plot_histogram
from statlearning import plot_feature_importance

from scipy import stats
from scipy.stats import norm, skew
# import statsmodels.api as sm
# import statsmodels.formula.api as smf


from sklearn.model_selection import KFold, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LassoCV
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

import xgboost as xgb

warnings.filterwarnings("ignore")
In [2]:
tic = time.time()

sns.set_context('notebook')
sns.set_style('ticks')
crayon = ['#4E79A7', '#F28E2C', '#E15759', '#76B7B2', '#59A14F',
          '#EDC949', '#AF7AA1', '#FF9DA7', '#9C755F', '#BAB0AB']
sns.set_palette("pastel")
plt.rcParams['figure.figsize'] = (9, 6)
%matplotlib inline
In [3]:
df = pd.read_csv("NBA_Train.csv")
df
Out[3]:
Train_ID SALARY POSITION TEAM Age Games Minutes PER TS ORB ... AST STL BLK TOV USG ORtg DRtg OWS DWS WS
0 1 2.489530 PF Houston Rockets 22 76 2078 19.1 0.577 9.2 ... 7.0 1.3 3.5 8.2 18.3 119 105 4.6 2.7 7.3
1 2 2.433333 PG Utah Jazz 24 73 1490 13.2 0.510 1.6 ... 28.8 1.7 0.1 14.3 19.0 105 109 1.2 1.2 2.3
2 3 20.158622 SF Oklahoma City Thunder 25 81 3122 29.8 0.635 2.2 ... 26.7 1.7 1.5 12.2 33.0 123 104 14.8 4.4 19.2
3 4 12.404495 PG Houston Rockets 26 62 2222 19.0 0.554 1.9 ... 38.2 2.2 0.3 16.9 22.7 113 110 4.8 1.5 6.3
4 5 1.500000 PG Brooklyn Nets 21 48 489 8.3 0.446 2.6 ... 20.4 2.7 0.2 20.9 17.4 90 110 -0.4 0.3 -0.1
5 6 2.854940 PF Miami Heat 33 46 653 10.5 0.523 8.4 ... 2.9 0.9 2.0 12.7 13.9 104 105 0.3 0.8 1.1
6 7 1.252440 SF Detroit Pistons 22 43 395 6.7 0.460 3.2 ... 4.3 1.1 0.2 9.4 15.4 93 108 -0.2 0.3 0.2
7 8 6.300000 PG Brooklyn Nets 30 80 2252 11.5 0.498 1.1 ... 22.7 1.3 0.8 14.8 17.8 102 110 0.9 1.4 2.3
8 9 15.514031 SG New Orleans Pelicans 25 64 2057 14.9 0.540 1.7 ... 16.6 1.9 0.5 12.8 23.2 105 113 2.0 0.5 2.5
9 10 12.000000 PG Toronto Raptors 27 79 2862 20.1 0.567 3.6 ... 34.7 2.2 0.4 13.4 22.9 118 106 8.4 3.3 11.7
10 11 3.553917 SF Toronto Raptors 22 81 2159 12.0 0.553 2.3 ... 6.1 1.5 1.0 10.0 18.6 107 107 2.0 2.2 4.2
11 12 6.270000 SG Detroit Pistons 26 77 2556 14.7 0.601 1.3 ... 8.5 2.1 0.1 9.9 19.1 112 113 3.8 0.7 4.5
12 13 4.440000 PF Washington Wizards 28 69 1376 18.2 0.552 10.2 ... 8.2 1.2 3.4 10.9 19.0 113 105 2.3 1.8 4.1
13 14 13.000000 C Washington Wizards 31 53 1560 16.6 0.530 4.7 ... 16.3 2.1 2.5 14.1 24.0 102 103 0.8 2.4 3.2
14 15 5.746479 SG Atlanta Hawks 32 71 2408 13.5 0.653 1.1 ... 13.3 1.5 0.8 13.6 14.2 118 108 3.9 2.0 5.9
15 16 15.851950 PG Washington Wizards 23 82 2980 19.5 0.524 1.5 ... 40.5 2.6 1.1 16.3 27.4 106 104 4.0 4.0 7.9
16 17 2.165160 C Boston Celtics 22 70 1400 15.2 0.546 11.0 ... 13.1 1.3 1.5 16.0 21.0 107 107 1.5 1.4 2.9
17 18 1.185784 SF Brooklyn Nets 26 54 768 9.1 0.466 3.2 ... 3.7 1.1 2.8 10.1 17.9 92 107 -0.4 0.8 0.3
18 19 4.236287 PG Portland Trail Blazers 23 82 2937 18.6 0.568 1.3 ... 25.1 1.1 0.5 11.5 25.0 116 110 7.8 1.8 9.6
19 20 9.756250 C Atlanta Hawks 29 59 1271 16.5 0.572 11.4 ... 10.1 1.2 1.8 15.1 17.7 114 102 2.3 2.1 4.3
20 21 2.380593 PF Orlando Magic 24 76 1174 9.9 0.489 5.1 ... 3.5 0.8 1.5 10.3 19.3 95 108 -0.4 1.0 0.6
21 22 22.359364 C Houston Rockets 28 71 2396 21.3 0.600 11.4 ... 9.3 1.2 4.0 17.5 24.0 109 101 4.0 4.1 8.0
22 23 7.500000 C San Antonio Spurs 31 79 1974 14.1 0.578 4.4 ... 16.3 1.1 1.2 16.3 17.0 109 104 2.3 2.6 4.9
23 24 5.219169 SF Los Angeles Lakers 28 64 1810 16.0 0.564 1.7 ... 9.0 1.2 0.5 8.6 26.8 107 114 2.3 0.3 2.5
24 25 4.000000 PF New York Knicks 22 78 1820 11.5 0.524 4.3 ... 4.7 1.3 0.9 11.3 16.2 105 109 1.3 1.3 2.6
25 26 6.500000 C Detroit Pistons 27 53 491 9.7 0.467 13.6 ... 9.9 0.2 0.7 17.3 19.2 97 105 -0.1 0.6 0.6
26 27 7.448760 C Chicago Bulls 33 60 1884 19.3 0.522 7.0 ... 18.7 0.7 3.5 12.4 26.4 102 108 1.3 1.7 3.0
27 28 4.500000 SG Chicago Bulls 33 82 2584 12.6 0.549 2.1 ... 12.3 1.3 1.4 11.3 17.0 108 102 2.4 4.1 6.5
28 29 3.333333 PF Atlanta Hawks 25 80 1482 15.3 0.559 4.9 ... 8.9 1.0 0.4 10.3 23.5 105 108 1.3 1.2 2.6
29 30 15.592217 C Los Angeles Lakers 27 81 2409 13.5 0.499 9.9 ... 6.2 0.6 5.7 14.5 19.4 100 98 0.5 5.0 5.4
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
96 97 1.449187 SF Sacramento Kings 33 56 1419 12.2 0.512 1.7 ... 9.6 1.7 1.0 9.5 20.1 102 108 0.6 1.2 1.8
97 98 1.320000 SF New Orleans Pelicans 26 65 1020 8.6 0.491 3.8 ... 6.9 2.0 1.2 14.3 13.4 97 107 -0.1 1.0 0.9
98 99 3.750000 PF New York Knicks 23 69 1188 16.5 0.526 9.5 ... 10.9 1.7 5.8 16.1 18.6 102 101 0.5 2.0 2.5
99 100 2.854940 SG Chicago Bulls 33 73 2116 10.8 0.494 1.7 ... 22.6 2.0 1.0 15.1 17.3 100 102 0.4 3.3 3.8
100 101 2.399040 PG Milwaukee Bucks 22 70 2414 15.5 0.480 4.3 ... 30.2 2.6 1.4 16.9 25.7 96 108 -0.8 2.1 1.3
101 102 19.689000 PF Cleveland Cavaliers 25 77 2797 26.9 0.591 8.5 ... 21.4 1.0 1.0 10.3 28.8 120 104 10.6 3.7 14.3
102 103 12.403101 SF Sacramento Kings 27 73 2531 18.3 0.540 5.0 ... 15.1 1.9 1.8 14.2 27.6 104 107 2.3 2.5 4.8
103 104 13.500000 C Charlotte Hornets 29 73 2553 22.7 0.532 6.9 ... 12.8 1.4 2.4 7.6 29.3 105 100 3.1 4.7 7.8
104 105 1.015421 SG Sacramento Kings 24 80 2309 10.9 0.530 2.7 ... 9.9 1.6 1.0 12.2 15.9 102 112 0.9 0.9 1.9
105 106 4.500000 SF Oklahoma City Thunder 25 82 2337 11.8 0.574 5.2 ... 4.7 1.3 1.3 10.4 14.0 117 113 3.6 0.7 4.4
106 107 5.000000 PF Boston Celtics 26 64 741 13.4 0.577 7.2 ... 7.8 1.4 0.6 15.5 16.0 111 110 1.0 0.5 1.4
107 108 5.613500 SF Washington Wizards 27 78 2157 11.5 0.591 2.0 ... 6.7 1.0 0.6 8.3 14.7 116 108 3.3 1.7 5.0
108 109 0.111444 C Memphis Grizzlies 29 61 482 11.9 0.730 7.4 ... 2.4 0.9 4.9 23.1 11.5 117 105 0.7 0.6 1.3
109 110 5.000000 PF Memphis Grizzlies 35 72 1396 18.5 0.683 12.1 ... 2.1 1.2 6.1 13.2 13.8 130 102 3.8 2.2 6.0
110 111 2.038206 SF Boston Celtics 22 62 764 10.1 0.541 4.3 ... 4.9 0.9 2.0 9.5 12.8 110 107 0.7 0.8 1.5
111 112 11.370786 PG Golden State Warriors 25 78 2846 24.1 0.610 1.8 ... 39.9 2.2 0.4 16.1 28.3 117 104 9.3 4.0 13.4
112 113 2.900000 PF Toronto Raptors 33 82 1399 13.4 0.505 7.2 ... 10.4 1.0 0.9 14.8 23.8 96 99 -0.4 2.8 2.5
113 114 2.250000 PG Chicago Bulls 29 72 1557 12.5 0.518 3.1 ... 23.3 1.6 0.6 15.8 20.5 105 110 1.3 0.9 2.2
114 115 5.675000 SG Charlotte Hornets 28 79 1973 13.8 0.574 1.7 ... 9.3 1.7 1.2 9.9 17.0 113 107 2.8 1.9 4.7
115 116 12.000000 PG Charlotte Hornets 23 73 2614 16.8 0.499 1.7 ... 29.7 1.7 1.0 11.6 25.8 103 105 1.8 3.3 5.1
116 117 8.229375 SG Houston Rockets 27 81 2609 12.7 0.550 2.6 ... 7.9 2.9 0.9 10.4 16.5 108 107 2.4 2.7 5.2
117 118 16.407501 PG Cleveland Cavaliers 21 71 2496 20.1 0.533 2.3 ... 31.6 2.2 0.8 12.1 28.2 109 108 4.6 2.1 6.7
118 119 5.158539 SG Memphis Grizzlies 32 55 1278 15.6 0.531 7.3 ... 11.7 3.8 1.2 16.2 20.1 102 101 0.5 2.1 2.7
119 120 1.035000 PF Phoenix Suns 24 49 642 17.6 0.566 6.8 ... 5.2 1.5 1.7 9.2 21.1 112 103 1.1 0.9 2.0
120 121 4.345000 PG Denver Nuggets 31 68 2179 13.9 0.515 1.5 ... 35.1 1.2 0.2 17.3 20.1 104 111 1.6 1.2 2.8
121 122 5.543725 PF Miami Heat 26 78 2360 13.8 0.548 4.0 ... 21.9 1.3 1.5 12.1 13.4 115 105 3.5 3.0 6.5
122 123 3.156600 SG Sacramento Kings 20 82 2187 7.7 0.485 2.7 ... 5.8 1.0 0.7 11.5 16.9 97 112 -0.1 0.9 0.8
123 124 8.000000 PF Washington Wizards 24 81 2153 18.4 0.564 7.3 ... 10.9 1.6 1.8 12.9 23.2 111 106 4.0 2.4 6.4
124 125 8.193029 PF Orlando Magic 30 82 2312 13.2 0.555 3.6 ... 6.4 1.3 2.1 9.8 17.3 111 107 2.9 2.4 5.3
125 126 10.734586 SG New Orleans Pelicans 24 72 2028 18.4 0.504 4.3 ... 30.4 2.2 0.9 14.5 27.1 103 110 1.8 1.3 3.1

126 rows × 22 columns

Data Preprocessing

Useless Columns

In [4]:
df.drop(columns="Train_ID", inplace=True)

Missing value

In [5]:
df.isnull().sum()
Out[5]:
SALARY      0
POSITION    0
TEAM        0
Age         0
Games       0
Minutes     0
PER         0
TS          0
ORB         0
DRB         0
TRB         0
AST         0
STL         0
BLK         0
TOV         0
USG         0
ORtg        0
DRtg        0
OWS         0
DWS         0
WS          0
dtype: int64

Outliers

In [6]:
def _scatter(df, feature, target):
    """

    """
    # plt.figure(figsize=(16,8))
    plt.scatter(df[feature], df[target])
    plt.xlabel(feature)
    plt.ylabel(target)

    items = [feature, " vs ", target]
    title = ''.join(items)
    plt.title(title)


def subplot_scatter(df, target):
    """
    Plot scatter figures of each column in the dataFrame. 
    Args:
        df: pandas.DataFrame 
            DataFrame input.

        target: pandas.Series()
            Target column.


    """

    plt.figure(figsize=(30, 90))
    num_subplot = len(df.columns.drop(target))
    for i, col in enumerate(df.columns.drop(target)):
        plt.subplot(num_subplot//2 + 1, 2, i+1)
        _scatter(df, col, target)


def subplot_box(df):
    """

    """

    plt.figure(figsize=(30, 90))
    num_subplot = len(df.columns)
    for i, col in enumerate(df.columns):
        plt.subplot(num_subplot//2 + 1, 2, i+1)
        sns.set_palette("pastel")
        sns.boxplot(x=df[col])


def zscore_drop_missing(df, col_list, THRESHOLD=3):
    """
    
    """
    for col in col_list:
        z = np.abs(stats.zscore(df[col]))
        df = df[(z < THRESHOLD)]

    df.index = range(len(df))
    return df
In [7]:
col_category = ["POSITION", "TEAM"]
col_number = df.columns.drop(["TEAM", "POSITION","SALARY"])
col_numbers = df.columns.drop(["TEAM", "POSITION"])

# subplot_scatter(df, "SALARY")

with sns.color_palette(crayon):
    plot_regressions(df[col_number], df["SALARY"])
    plt.show()
In [8]:
subplot_box(df[col_numbers])
In [9]:
#"SALARY"
df = zscore_drop_missing(df, col_numbers, THRESHOLD=3.7)
In [10]:
df
Out[10]:
SALARY POSITION TEAM Age Games Minutes PER TS ORB DRB ... AST STL BLK TOV USG ORtg DRtg OWS DWS WS
0 2.489530 PF Houston Rockets 22 76 2078 19.1 0.577 9.2 18.5 ... 7.0 1.3 3.5 8.2 18.3 119 105 4.6 2.7 7.3
1 2.433333 PG Utah Jazz 24 73 1490 13.2 0.510 1.6 10.7 ... 28.8 1.7 0.1 14.3 19.0 105 109 1.2 1.2 2.3
2 12.404495 PG Houston Rockets 26 62 2222 19.0 0.554 1.9 8.5 ... 38.2 2.2 0.3 16.9 22.7 113 110 4.8 1.5 6.3
3 1.500000 PG Brooklyn Nets 21 48 489 8.3 0.446 2.6 7.3 ... 20.4 2.7 0.2 20.9 17.4 90 110 -0.4 0.3 -0.1
4 2.854940 PF Miami Heat 33 46 653 10.5 0.523 8.4 24.7 ... 2.9 0.9 2.0 12.7 13.9 104 105 0.3 0.8 1.1
5 1.252440 SF Detroit Pistons 22 43 395 6.7 0.460 3.2 11.7 ... 4.3 1.1 0.2 9.4 15.4 93 108 -0.2 0.3 0.2
6 6.300000 PG Brooklyn Nets 30 80 2252 11.5 0.498 1.1 10.3 ... 22.7 1.3 0.8 14.8 17.8 102 110 0.9 1.4 2.3
7 15.514031 SG New Orleans Pelicans 25 64 2057 14.9 0.540 1.7 7.7 ... 16.6 1.9 0.5 12.8 23.2 105 113 2.0 0.5 2.5
8 12.000000 PG Toronto Raptors 27 79 2862 20.1 0.567 3.6 11.5 ... 34.7 2.2 0.4 13.4 22.9 118 106 8.4 3.3 11.7
9 3.553917 SF Toronto Raptors 22 81 2159 12.0 0.553 2.3 11.3 ... 6.1 1.5 1.0 10.0 18.6 107 107 2.0 2.2 4.2
10 6.270000 SG Detroit Pistons 26 77 2556 14.7 0.601 1.3 6.9 ... 8.5 2.1 0.1 9.9 19.1 112 113 3.8 0.7 4.5
11 4.440000 PF Washington Wizards 28 69 1376 18.2 0.552 10.2 24.0 ... 8.2 1.2 3.4 10.9 19.0 113 105 2.3 1.8 4.1
12 13.000000 C Washington Wizards 31 53 1560 16.6 0.530 4.7 17.3 ... 16.3 2.1 2.5 14.1 24.0 102 103 0.8 2.4 3.2
13 5.746479 SG Atlanta Hawks 32 71 2408 13.5 0.653 1.1 12.4 ... 13.3 1.5 0.8 13.6 14.2 118 108 3.9 2.0 5.9
14 15.851950 PG Washington Wizards 23 82 2980 19.5 0.524 1.5 11.7 ... 40.5 2.6 1.1 16.3 27.4 106 104 4.0 4.0 7.9
15 2.165160 C Boston Celtics 22 70 1400 15.2 0.546 11.0 18.8 ... 13.1 1.3 1.5 16.0 21.0 107 107 1.5 1.4 2.9
16 1.185784 SF Brooklyn Nets 26 54 768 9.1 0.466 3.2 18.2 ... 3.7 1.1 2.8 10.1 17.9 92 107 -0.4 0.8 0.3
17 4.236287 PG Portland Trail Blazers 23 82 2937 18.6 0.568 1.3 9.2 ... 25.1 1.1 0.5 11.5 25.0 116 110 7.8 1.8 9.6
18 9.756250 C Atlanta Hawks 29 59 1271 16.5 0.572 11.4 20.5 ... 10.1 1.2 1.8 15.1 17.7 114 102 2.3 2.1 4.3
19 2.380593 PF Orlando Magic 24 76 1174 9.9 0.489 5.1 19.5 ... 3.5 0.8 1.5 10.3 19.3 95 108 -0.4 1.0 0.6
20 22.359364 C Houston Rockets 28 71 2396 21.3 0.600 11.4 27.9 ... 9.3 1.2 4.0 17.5 24.0 109 101 4.0 4.1 8.0
21 7.500000 C San Antonio Spurs 31 79 1974 14.1 0.578 4.4 13.8 ... 16.3 1.1 1.2 16.3 17.0 109 104 2.3 2.6 4.9
22 5.219169 SF Los Angeles Lakers 28 64 1810 16.0 0.564 1.7 8.1 ... 9.0 1.2 0.5 8.6 26.8 107 114 2.3 0.3 2.5
23 4.000000 PF New York Knicks 22 78 1820 11.5 0.524 4.3 15.8 ... 4.7 1.3 0.9 11.3 16.2 105 109 1.3 1.3 2.6
24 6.500000 C Detroit Pistons 27 53 491 9.7 0.467 13.6 19.4 ... 9.9 0.2 0.7 17.3 19.2 97 105 -0.1 0.6 0.6
25 7.448760 C Chicago Bulls 33 60 1884 19.3 0.522 7.0 25.9 ... 18.7 0.7 3.5 12.4 26.4 102 108 1.3 1.7 3.0
26 4.500000 SG Chicago Bulls 33 82 2584 12.6 0.549 2.1 13.0 ... 12.3 1.3 1.4 11.3 17.0 108 102 2.4 4.1 6.5
27 3.333333 PF Atlanta Hawks 25 80 1482 15.3 0.559 4.9 17.2 ... 8.9 1.0 0.4 10.3 23.5 105 108 1.3 1.2 2.6
28 15.592217 C Los Angeles Lakers 27 81 2409 13.5 0.499 9.9 15.0 ... 6.2 0.6 5.7 14.5 19.4 100 98 0.5 5.0 5.4
29 2.144772 PG Philadelphia 76ers 22 54 1564 12.6 0.494 1.0 9.6 ... 44.3 1.5 0.2 25.6 16.3 101 113 0.6 0.3 0.9
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
90 2.814000 SG San Antonio Spurs 36 68 1550 20.0 0.590 2.3 12.0 ... 29.0 2.3 0.8 16.4 24.7 112 103 3.3 2.4 5.7
91 1.449187 SF Sacramento Kings 33 56 1419 12.2 0.512 1.7 16.8 ... 9.6 1.7 1.0 9.5 20.1 102 108 0.6 1.2 1.8
92 1.320000 SF New Orleans Pelicans 26 65 1020 8.6 0.491 3.8 12.8 ... 6.9 2.0 1.2 14.3 13.4 97 107 -0.1 1.0 0.9
93 3.750000 PF New York Knicks 23 69 1188 16.5 0.526 9.5 25.2 ... 10.9 1.7 5.8 16.1 18.6 102 101 0.5 2.0 2.5
94 2.854940 SG Chicago Bulls 33 73 2116 10.8 0.494 1.7 8.5 ... 22.6 2.0 1.0 15.1 17.3 100 102 0.4 3.3 3.8
95 2.399040 PG Milwaukee Bucks 22 70 2414 15.5 0.480 4.3 15.6 ... 30.2 2.6 1.4 16.9 25.7 96 108 -0.8 2.1 1.3
96 19.689000 PF Cleveland Cavaliers 25 77 2797 26.9 0.591 8.5 29.5 ... 21.4 1.0 1.0 10.3 28.8 120 104 10.6 3.7 14.3
97 12.403101 SF Sacramento Kings 27 73 2531 18.3 0.540 5.0 14.8 ... 15.1 1.9 1.8 14.2 27.6 104 107 2.3 2.5 4.8
98 13.500000 C Charlotte Hornets 29 73 2553 22.7 0.532 6.9 28.3 ... 12.8 1.4 2.4 7.6 29.3 105 100 3.1 4.7 7.8
99 1.015421 SG Sacramento Kings 24 80 2309 10.9 0.530 2.7 11.7 ... 9.9 1.6 1.0 12.2 15.9 102 112 0.9 0.9 1.9
100 4.500000 SF Oklahoma City Thunder 25 82 2337 11.8 0.574 5.2 9.2 ... 4.7 1.3 1.3 10.4 14.0 117 113 3.6 0.7 4.4
101 5.000000 PF Boston Celtics 26 64 741 13.4 0.577 7.2 19.3 ... 7.8 1.4 0.6 15.5 16.0 111 110 1.0 0.5 1.4
102 5.613500 SF Washington Wizards 27 78 2157 11.5 0.591 2.0 10.1 ... 6.7 1.0 0.6 8.3 14.7 116 108 3.3 1.7 5.0
103 5.000000 PF Memphis Grizzlies 35 72 1396 18.5 0.683 12.1 21.7 ... 2.1 1.2 6.1 13.2 13.8 130 102 3.8 2.2 6.0
104 2.038206 SF Boston Celtics 22 62 764 10.1 0.541 4.3 12.1 ... 4.9 0.9 2.0 9.5 12.8 110 107 0.7 0.8 1.5
105 11.370786 PG Golden State Warriors 25 78 2846 24.1 0.610 1.8 10.9 ... 39.9 2.2 0.4 16.1 28.3 117 104 9.3 4.0 13.4
106 2.900000 PF Toronto Raptors 33 82 1399 13.4 0.505 7.2 23.4 ... 10.4 1.0 0.9 14.8 23.8 96 99 -0.4 2.8 2.5
107 2.250000 PG Chicago Bulls 29 72 1557 12.5 0.518 3.1 6.7 ... 23.3 1.6 0.6 15.8 20.5 105 110 1.3 0.9 2.2
108 5.675000 SG Charlotte Hornets 28 79 1973 13.8 0.574 1.7 9.5 ... 9.3 1.7 1.2 9.9 17.0 113 107 2.8 1.9 4.7
109 12.000000 PG Charlotte Hornets 23 73 2614 16.8 0.499 1.7 11.6 ... 29.7 1.7 1.0 11.6 25.8 103 105 1.8 3.3 5.1
110 8.229375 SG Houston Rockets 27 81 2609 12.7 0.550 2.6 6.2 ... 7.9 2.9 0.9 10.4 16.5 108 107 2.4 2.7 5.2
111 16.407501 PG Cleveland Cavaliers 21 71 2496 20.1 0.533 2.3 9.5 ... 31.6 2.2 0.8 12.1 28.2 109 108 4.6 2.1 6.7
112 5.158539 SG Memphis Grizzlies 32 55 1278 15.6 0.531 7.3 12.0 ... 11.7 3.8 1.2 16.2 20.1 102 101 0.5 2.1 2.7
113 1.035000 PF Phoenix Suns 24 49 642 17.6 0.566 6.8 22.4 ... 5.2 1.5 1.7 9.2 21.1 112 103 1.1 0.9 2.0
114 4.345000 PG Denver Nuggets 31 68 2179 13.9 0.515 1.5 10.5 ... 35.1 1.2 0.2 17.3 20.1 104 111 1.6 1.2 2.8
115 5.543725 PF Miami Heat 26 78 2360 13.8 0.548 4.0 13.8 ... 21.9 1.3 1.5 12.1 13.4 115 105 3.5 3.0 6.5
116 3.156600 SG Sacramento Kings 20 82 2187 7.7 0.485 2.7 9.5 ... 5.8 1.0 0.7 11.5 16.9 97 112 -0.1 0.9 0.8
117 8.000000 PF Washington Wizards 24 81 2153 18.4 0.564 7.3 18.0 ... 10.9 1.6 1.8 12.9 23.2 111 106 4.0 2.4 6.4
118 8.193029 PF Orlando Magic 30 82 2312 13.2 0.555 3.6 16.6 ... 6.4 1.3 2.1 9.8 17.3 111 107 2.9 2.4 5.3
119 10.734586 SG New Orleans Pelicans 24 72 2028 18.4 0.504 4.3 15.4 ... 30.4 2.2 0.9 14.5 27.1 103 110 1.8 1.3 3.1

120 rows × 21 columns

In [11]:
# subplot_scatter(df, "SALARY")
with sns.color_palette(crayon):
    plot_regressions(df[col_number], df["SALARY"])
    plt.show()
In [12]:
subplot_box(df[col_numbers])

Normalization

In [13]:
normalized  = lambda x: (x - x.min()) * (1-0) / (x.max()-x.min())

normalized(df[col_number])
Out[13]:
Age Games Minutes PER TS ORB DRB TRB AST STL BLK TOV USG ORtg DRtg OWS DWS WS
0 0.166667 0.857143 0.650830 0.613861 0.552743 0.496970 0.474903 0.549451 0.116114 0.305556 0.573770 0.052910 0.389610 0.725 0.411765 0.473684 0.54 0.513889
1 0.277778 0.785714 0.423716 0.321782 0.270042 0.036364 0.173745 0.115385 0.632701 0.416667 0.016393 0.375661 0.419913 0.375 0.647059 0.175439 0.24 0.166667
2 0.388889 0.523810 0.706450 0.608911 0.455696 0.054545 0.088803 0.060440 0.855450 0.555556 0.049180 0.513228 0.580087 0.575 0.705882 0.491228 0.30 0.444444
3 0.111111 0.190476 0.037080 0.079208 0.000000 0.096970 0.042471 0.049451 0.433649 0.694444 0.032787 0.724868 0.350649 0.000 0.705882 0.035088 0.06 0.000000
4 0.777778 0.142857 0.100425 0.188119 0.324895 0.448485 0.714286 0.697802 0.018957 0.194444 0.327869 0.291005 0.199134 0.350 0.411765 0.096491 0.16 0.083333
5 0.166667 0.071429 0.000772 0.000000 0.059072 0.133333 0.212355 0.192308 0.052133 0.250000 0.032787 0.116402 0.264069 0.075 0.588235 0.052632 0.06 0.020833
6 0.611111 0.952381 0.718038 0.237624 0.219409 0.006061 0.158301 0.082418 0.488152 0.305556 0.131148 0.402116 0.367965 0.300 0.705882 0.149123 0.28 0.166667
7 0.333333 0.571429 0.642719 0.405941 0.396624 0.042424 0.057915 0.032967 0.343602 0.472222 0.081967 0.296296 0.601732 0.375 0.882353 0.245614 0.10 0.180556
8 0.444444 0.928571 0.953650 0.663366 0.510549 0.157576 0.204633 0.186813 0.772512 0.555556 0.065574 0.328042 0.588745 0.700 0.470588 0.807018 0.66 0.819444
9 0.166667 0.976190 0.682117 0.262376 0.451477 0.078788 0.196911 0.148352 0.094787 0.361111 0.163934 0.148148 0.402597 0.425 0.529412 0.245614 0.44 0.298611
10 0.388889 0.880952 0.835458 0.396040 0.654008 0.018182 0.027027 0.000000 0.151659 0.527778 0.016393 0.142857 0.424242 0.550 0.882353 0.403509 0.14 0.319444
11 0.500000 0.690476 0.379683 0.569307 0.447257 0.557576 0.687259 0.703297 0.144550 0.277778 0.557377 0.195767 0.419913 0.575 0.411765 0.271930 0.36 0.291667
12 0.666667 0.309524 0.450753 0.490099 0.354430 0.224242 0.428571 0.373626 0.336493 0.527778 0.409836 0.365079 0.636364 0.300 0.294118 0.140351 0.48 0.229167
13 0.722222 0.738095 0.778293 0.336634 0.873418 0.006061 0.239382 0.148352 0.265403 0.361111 0.131148 0.338624 0.212121 0.700 0.588235 0.412281 0.40 0.416667
14 0.222222 1.000000 0.999228 0.633663 0.329114 0.030303 0.212355 0.131868 0.909953 0.666667 0.180328 0.481481 0.783550 0.400 0.352941 0.421053 0.80 0.555556
15 0.166667 0.714286 0.388953 0.420792 0.421941 0.606061 0.486486 0.587912 0.260664 0.305556 0.245902 0.465608 0.506494 0.425 0.529412 0.201754 0.28 0.208333
16 0.388889 0.333333 0.144844 0.118812 0.084388 0.133333 0.463320 0.351648 0.037915 0.250000 0.459016 0.153439 0.372294 0.050 0.529412 0.035088 0.16 0.027778
17 0.222222 1.000000 0.982619 0.589109 0.514768 0.018182 0.115830 0.065934 0.545024 0.250000 0.081967 0.227513 0.679654 0.650 0.705882 0.754386 0.36 0.673611
18 0.555556 0.452381 0.339127 0.485149 0.531646 0.630303 0.552124 0.659341 0.189573 0.277778 0.295082 0.417989 0.363636 0.600 0.235294 0.271930 0.42 0.305556
19 0.277778 0.857143 0.301661 0.158416 0.181435 0.248485 0.513514 0.450549 0.033175 0.166667 0.245902 0.164021 0.432900 0.125 0.588235 0.035088 0.20 0.048611
20 0.500000 0.738095 0.773658 0.722772 0.649789 0.630303 0.837838 0.879121 0.170616 0.277778 0.655738 0.544974 0.636364 0.475 0.176471 0.421053 0.82 0.562500
21 0.666667 0.928571 0.610660 0.366337 0.556962 0.206061 0.293436 0.285714 0.336493 0.250000 0.196721 0.481481 0.333333 0.475 0.352941 0.271930 0.52 0.347222
22 0.500000 0.571429 0.547316 0.460396 0.497890 0.042424 0.073359 0.043956 0.163507 0.277778 0.081967 0.074074 0.757576 0.425 0.941176 0.271930 0.06 0.180556
23 0.166667 0.904762 0.551178 0.237624 0.329114 0.200000 0.370656 0.324176 0.061611 0.305556 0.147541 0.216931 0.298701 0.375 0.647059 0.184211 0.26 0.187500
24 0.444444 0.309524 0.037852 0.148515 0.088608 0.763636 0.509653 0.686813 0.184834 0.000000 0.114754 0.534392 0.428571 0.175 0.411765 0.061404 0.12 0.048611
25 0.777778 0.476190 0.575898 0.623762 0.320675 0.363636 0.760618 0.675824 0.393365 0.138889 0.573770 0.275132 0.740260 0.300 0.588235 0.184211 0.34 0.215278
26 0.777778 1.000000 0.846273 0.292079 0.434599 0.066667 0.262548 0.192308 0.241706 0.305556 0.229508 0.216931 0.333333 0.450 0.235294 0.280702 0.82 0.458333
27 0.333333 0.952381 0.420626 0.425743 0.476793 0.236364 0.424710 0.384615 0.161137 0.222222 0.065574 0.164021 0.614719 0.375 0.588235 0.184211 0.24 0.187500
28 0.444444 0.976190 0.778679 0.336634 0.223629 0.539394 0.339768 0.461538 0.097156 0.111111 0.934426 0.386243 0.437229 0.250 0.000000 0.114035 1.00 0.381944
29 0.166667 0.333333 0.452298 0.292079 0.202532 0.000000 0.131274 0.065934 1.000000 0.361111 0.032787 0.973545 0.303030 0.275 0.882353 0.122807 0.06 0.069444
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
90 0.944444 0.666667 0.446891 0.658416 0.607595 0.078788 0.223938 0.175824 0.637441 0.583333 0.131148 0.486772 0.666667 0.550 0.294118 0.359649 0.48 0.402778
91 0.777778 0.380952 0.396292 0.272277 0.278481 0.042424 0.409266 0.280220 0.177725 0.416667 0.163934 0.121693 0.467532 0.300 0.588235 0.122807 0.24 0.131944
92 0.388889 0.595238 0.242178 0.094059 0.189873 0.169697 0.254826 0.225275 0.113744 0.500000 0.196721 0.375661 0.177489 0.175 0.529412 0.061404 0.20 0.069444
93 0.222222 0.690476 0.307068 0.485149 0.337553 0.515152 0.733591 0.725275 0.208531 0.416667 0.950820 0.470899 0.402597 0.300 0.176471 0.114035 0.40 0.180556
94 0.777778 0.785714 0.665508 0.202970 0.202532 0.042424 0.088803 0.060440 0.485782 0.500000 0.163934 0.417989 0.346320 0.250 0.235294 0.105263 0.66 0.270833
95 0.166667 0.714286 0.780610 0.435644 0.143460 0.200000 0.362934 0.313187 0.665877 0.666667 0.229508 0.513228 0.709957 0.150 0.588235 0.000000 0.42 0.097222
96 0.333333 0.880952 0.928544 1.000000 0.611814 0.454545 0.899614 0.802198 0.457346 0.222222 0.163934 0.164021 0.844156 0.750 0.352941 1.000000 0.74 1.000000
97 0.444444 0.785714 0.825801 0.574257 0.396624 0.242424 0.332046 0.313187 0.308057 0.472222 0.295082 0.370370 0.792208 0.350 0.529412 0.271930 0.50 0.340278
98 0.555556 0.785714 0.834299 0.792079 0.362869 0.357576 0.853282 0.736264 0.253555 0.333333 0.393443 0.021164 0.865801 0.375 0.117647 0.342105 0.94 0.548611
99 0.277778 0.952381 0.740054 0.207921 0.354430 0.103030 0.212355 0.159341 0.184834 0.388889 0.163934 0.264550 0.285714 0.300 0.823529 0.149123 0.18 0.138889
100 0.333333 1.000000 0.750869 0.252475 0.540084 0.254545 0.115830 0.164835 0.061611 0.305556 0.213115 0.169312 0.203463 0.675 0.882353 0.385965 0.14 0.312500
101 0.388889 0.571429 0.134415 0.331683 0.552743 0.375758 0.505792 0.483516 0.135071 0.333333 0.098361 0.439153 0.290043 0.525 0.705882 0.157895 0.10 0.104167
102 0.444444 0.904762 0.681344 0.237624 0.611814 0.060606 0.150579 0.104396 0.109005 0.222222 0.098361 0.058201 0.233766 0.650 0.588235 0.359649 0.34 0.354167
103 0.888889 0.761905 0.387408 0.584158 1.000000 0.672727 0.598456 0.714286 0.000000 0.277778 1.000000 0.317460 0.194805 1.000 0.235294 0.403509 0.44 0.423611
104 0.166667 0.523810 0.143299 0.168317 0.400844 0.200000 0.227799 0.236264 0.066351 0.194444 0.327869 0.121693 0.151515 0.500 0.529412 0.131579 0.16 0.111111
105 0.333333 0.904762 0.947470 0.861386 0.691983 0.048485 0.181467 0.126374 0.895735 0.555556 0.065574 0.470899 0.822511 0.675 0.352941 0.885965 0.80 0.937500
106 0.777778 1.000000 0.388567 0.331683 0.248945 0.375758 0.664093 0.637363 0.196682 0.222222 0.147541 0.402116 0.627706 0.150 0.058824 0.035088 0.56 0.180556
107 0.555556 0.761905 0.449594 0.287129 0.303797 0.127273 0.019305 0.043956 0.502370 0.388889 0.098361 0.455026 0.484848 0.375 0.705882 0.184211 0.18 0.159722
108 0.500000 0.928571 0.610274 0.351485 0.540084 0.042424 0.127413 0.082418 0.170616 0.416667 0.196721 0.142857 0.333333 0.575 0.529412 0.315789 0.38 0.333333
109 0.222222 0.785714 0.857860 0.500000 0.223629 0.042424 0.208494 0.137363 0.654028 0.416667 0.163934 0.232804 0.714286 0.325 0.411765 0.228070 0.66 0.361111
110 0.444444 0.976190 0.855929 0.297030 0.438819 0.096970 0.000000 0.010989 0.137441 0.750000 0.147541 0.169312 0.311688 0.450 0.529412 0.280702 0.54 0.368056
111 0.111111 0.738095 0.812283 0.663366 0.367089 0.078788 0.127413 0.093407 0.699052 0.555556 0.131148 0.259259 0.818182 0.475 0.588235 0.473684 0.42 0.472222
112 0.722222 0.357143 0.341831 0.440594 0.358650 0.381818 0.223938 0.302198 0.227488 1.000000 0.196721 0.476190 0.467532 0.300 0.176471 0.114035 0.42 0.194444
113 0.277778 0.214286 0.096176 0.539604 0.506329 0.351515 0.625483 0.576923 0.073460 0.361111 0.278689 0.105820 0.510823 0.550 0.294118 0.166667 0.18 0.145833
114 0.666667 0.666667 0.689842 0.356436 0.291139 0.030303 0.166023 0.104396 0.781991 0.277778 0.032787 0.534392 0.467532 0.350 0.764706 0.210526 0.24 0.201389
115 0.388889 0.904762 0.759753 0.351485 0.430380 0.181818 0.293436 0.263736 0.469194 0.305556 0.245902 0.259259 0.177489 0.625 0.411765 0.377193 0.60 0.458333
116 0.055556 1.000000 0.692932 0.049505 0.164557 0.103030 0.127413 0.109890 0.087678 0.222222 0.114754 0.227513 0.329004 0.175 0.823529 0.061404 0.18 0.062500
117 0.277778 0.976190 0.679799 0.579208 0.497890 0.381818 0.455598 0.467033 0.208531 0.388889 0.295082 0.301587 0.601732 0.525 0.470588 0.421053 0.48 0.451389
118 0.611111 1.000000 0.741213 0.321782 0.459916 0.157576 0.401544 0.329670 0.101896 0.305556 0.344262 0.137566 0.346320 0.525 0.529412 0.324561 0.48 0.375000
119 0.277778 0.761905 0.631518 0.579208 0.244726 0.200000 0.355212 0.313187 0.670616 0.555556 0.147541 0.386243 0.770563 0.325 0.705882 0.228070 0.26 0.222222

120 rows × 18 columns

Exploratory data analysis (EDA)

In [14]:
def get_status(x):
    return pd.DataFrame([x.count(), x.mean(), x.std(), x.min(), x.quantile(.25), x.quantile(.5), x.quantile(.75), x.max(), x.median(), x.mad(), x.var(), x.std(), x.skew(), x.kurt(), ],
                        index=['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max', 'median', 'mad', 'var', 'std', 'skew', 'kurt', ]).round(3)


def plot_distributed(series):
    
    sns.distplot(series, fit=norm, bins=10, color='cornflowerblue')
    (mu, sigma) = norm.fit(series)
    plt.legend(['Normal dist.($\mu=$ {:.2f} and $\sigma=${:.2f})'.format(
        mu, sigma)], loc='best')
    plt.ylabel('Frequency')
    plt.title('Distribution of '+series.name)

def subplot_distributed(df):
    """

    """
    plt.figure(figsize=(30, 90))
    num_subplot = len(df.columns)
    for i, col in enumerate(df.columns):
        plt.subplot(num_subplot//2 + 1, 2, i+1)
        sns.set_palette("pastel")
        
        plot_distributed(df[col])

def plot_prob(series):
    stats.probplot(series,plot=plt)
    

def subplot_prob(df):
    """

    """
    plt.figure(figsize=(30, 90))
    num_subplot = len(df.columns)
    for i, col in enumerate(df.columns):
        plt.subplot(num_subplot//2 + 1, 2, i+1)
        sns.set_palette("pastel")
        plot_prob(df[col])
        

Target Variable

In [15]:
get_status(df["SALARY"])
Out[15]:
0
count 120.000
mean 6.603
std 5.286
min 0.947
25% 2.425
50% 4.500
75% 9.104
max 22.875
median 4.500
mad 4.222
var 27.947
std 5.286
skew 1.218
kurt 0.781
In [16]:
plot_distributed(df["SALARY"])
In [17]:
plot_prob(df["SALARY"])
In [18]:
get_status(df["SALARY"]**(1/3))
Out[18]:
0
count 120.000
mean 1.750
std 0.480
min 0.982
25% 1.343
50% 1.651
75% 2.088
max 2.839
median 1.651
mad 0.399
var 0.230
std 0.480
skew 0.380
kurt -0.720
In [19]:
plot_distributed(df["SALARY"]**(1/3))
In [20]:
plot_prob(df["SALARY"]**(1/3))

Numerical Variables

In [21]:
df_eda = normalized(df[col_number])
get_status(df_eda)
Out[21]:
Age Games Minutes PER TS ORB DRB TRB AST STL BLK TOV USG ORtg DRtg OWS DWS WS
count 120.000 120.000 120.000 120.000 120.000 120.000 120.000 120.000 120.000 120.000 120.000 120.000 120.000 120.000 120.000 120.000 120.000 120.000
mean 0.429 0.704 0.546 0.401 0.397 0.249 0.336 0.324 0.292 0.385 0.238 0.318 0.459 0.428 0.532 0.263 0.357 0.284
std 0.229 0.270 0.260 0.191 0.173 0.231 0.232 0.254 0.230 0.171 0.203 0.201 0.200 0.179 0.213 0.195 0.216 0.202
min 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000
25% 0.222 0.524 0.365 0.286 0.270 0.067 0.140 0.104 0.118 0.278 0.098 0.169 0.318 0.300 0.353 0.123 0.180 0.137
50% 0.389 0.774 0.566 0.361 0.395 0.164 0.282 0.253 0.217 0.361 0.172 0.272 0.424 0.425 0.529 0.215 0.320 0.226
75% 0.556 0.929 0.763 0.486 0.511 0.377 0.491 0.516 0.403 0.500 0.295 0.435 0.602 0.531 0.706 0.346 0.480 0.384
max 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000
median 0.389 0.774 0.566 0.361 0.395 0.164 0.282 0.253 0.217 0.361 0.172 0.272 0.424 0.425 0.529 0.215 0.320 0.226
mad 0.185 0.224 0.222 0.148 0.137 0.190 0.194 0.216 0.184 0.133 0.151 0.158 0.163 0.142 0.174 0.142 0.177 0.156
var 0.052 0.073 0.067 0.037 0.030 0.054 0.054 0.065 0.053 0.029 0.041 0.040 0.040 0.032 0.045 0.038 0.047 0.041
std 0.229 0.270 0.260 0.191 0.173 0.231 0.232 0.254 0.230 0.171 0.203 0.201 0.200 0.179 0.213 0.195 0.216 0.202
skew 0.455 -0.822 -0.184 0.720 0.465 1.070 0.686 0.678 1.104 0.820 1.593 1.031 0.375 0.291 -0.130 1.698 0.718 1.304
kurt -0.383 -0.303 -0.925 0.428 0.690 0.197 -0.349 -0.758 0.466 1.211 2.664 1.229 -0.391 0.383 -0.609 3.441 -0.048 1.873
In [22]:
subplot_distributed(df_eda)
In [23]:
subplot_prob(df_eda)
In [24]:
skew_high_cols = ["BLK", "OWS", "WS"]
for col in skew_high_cols:
    df_eda[col] = df_eda[col] ** (1/3)

skew_medium_cols = ["PER", "TS", "ORB",
                    "DRB", "TRB", "AST", "STL", "TOV", "DWS"]
for col in skew_medium_cols:
    df_eda[col] = np.sqrt(df_eda[col])

skew_low_log = ["Games"]
for col in skew_low_log:
    df_eda[col] = df_eda[col] ** 2
In [25]:
get_status(df_eda)
Out[25]:
Age Games Minutes PER TS ORB DRB TRB AST STL BLK TOV USG ORtg DRtg OWS DWS WS
count 120.000 120.000 120.000 120.000 120.000 120.000 120.000 120.000 120.000 120.000 120.000 120.000 120.000 120.000 120.000 120.000 120.000 120.000
mean 0.429 0.567 0.546 0.613 0.612 0.442 0.541 0.520 0.498 0.604 0.572 0.534 0.459 0.428 0.532 0.603 0.568 0.619
std 0.229 0.329 0.260 0.158 0.149 0.231 0.210 0.232 0.211 0.143 0.173 0.182 0.200 0.179 0.213 0.158 0.188 0.162
min 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000
25% 0.222 0.274 0.365 0.535 0.520 0.258 0.374 0.323 0.343 0.527 0.462 0.411 0.318 0.300 0.353 0.497 0.424 0.516
50% 0.389 0.599 0.566 0.601 0.628 0.404 0.531 0.503 0.466 0.601 0.556 0.522 0.424 0.425 0.529 0.599 0.566 0.609
75% 0.556 0.862 0.763 0.697 0.715 0.614 0.701 0.719 0.635 0.707 0.666 0.660 0.602 0.531 0.706 0.702 0.693 0.727
max 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000
median 0.389 0.599 0.566 0.601 0.628 0.404 0.531 0.503 0.466 0.601 0.556 0.522 0.424 0.425 0.529 0.599 0.566 0.609
mad 0.185 0.286 0.222 0.119 0.115 0.194 0.176 0.199 0.169 0.110 0.135 0.145 0.163 0.142 0.174 0.121 0.154 0.128
var 0.052 0.108 0.067 0.025 0.022 0.053 0.044 0.054 0.044 0.020 0.030 0.033 0.040 0.032 0.045 0.025 0.035 0.026
std 0.229 0.329 0.260 0.158 0.149 0.231 0.210 0.232 0.211 0.143 0.173 0.182 0.200 0.179 0.213 0.158 0.188 0.162
skew 0.455 -0.239 -0.184 -0.312 -0.646 0.387 -0.015 0.144 0.319 -0.339 0.044 0.019 0.375 0.291 -0.130 -0.186 -0.065 -0.310
kurt -0.383 -1.306 -0.925 1.340 1.875 -0.802 -0.647 -1.069 -0.386 2.105 0.404 0.202 -0.391 0.383 -0.609 1.512 -0.192 1.111
In [26]:
subplot_distributed(df_eda)
In [27]:
subplot_prob(df_eda)
In [28]:
df_eda = pd.concat([df_eda,np.log(df["SALARY"])],axis=1)
plt.figure(figsize = (10,8))
sns.heatmap(df_eda.corr(), vmin = 0, vmax = 1, cmap = 'Reds')
plt.title('Correlation Among Variables', fontsize = 16)
Out[28]:
Text(0.5, 1, 'Correlation Among Variables')
In [29]:
print(abs(df_eda.corr()["SALARY"]).sort_values(ascending=False))
SALARY     1.000000
Minutes    0.612484
WS         0.595247
PER        0.545343
DWS        0.536237
OWS        0.492768
USG        0.486173
AST        0.275239
TS         0.230146
DRtg       0.216809
ORtg       0.208605
Games      0.188585
Age        0.124599
DRB        0.116490
TRB        0.079937
BLK        0.071221
STL        0.070923
TOV        0.061397
ORB        0.019530
Name: SALARY, dtype: float64
In [30]:
sns.pairplot(df_eda[["Games","Minutes"]])
Out[30]:
<seaborn.axisgrid.PairGrid at 0x1c192f19e8>
In [31]:
with sns.color_palette(crayon):
    plot_regressions(df_eda[["Games","Minutes"]], df_eda["SALARY"])
    plt.show()
In [32]:
sns.pairplot(df_eda[["ORB","DRB","TRB"]])
Out[32]:
<seaborn.axisgrid.PairGrid at 0x1c190a6128>
In [33]:
with sns.color_palette(crayon):
    plot_regressions(df_eda[["ORB","DRB","TRB"]], df_eda["SALARY"])
    plt.show()
In [34]:
sns.pairplot(df_eda[["OWS","DWS","WS"]])
Out[34]:
<seaborn.axisgrid.PairGrid at 0x1c182bcdd8>
In [35]:
with sns.color_palette(crayon):
    plot_regressions(df_eda[["ORB","DRB","TRB"]], df_eda["SALARY"])
    plt.show()

Category Variables

In [36]:
plt.figure(figsize=(20,10))
sns.set_palette("pastel")
sns.boxplot(x = 'TEAM', y = 'SALARY', data = df)
sns.despine()
plt.title('Correlation between SALARY and TEAM', fontsize = 16)
plt.xticks(rotation=270)
plt.show()
In [37]:
sns.set_palette("pastel")
sns.boxplot(x = 'POSITION', y = 'SALARY', data = df)
sns.despine()
plt.title('Correlation between SALARY and TEAM', fontsize = 16)
plt.show()
In [38]:
def fe_categorical_var(df, cols):
    d = pd.DataFrame()
    for col in cols:
        dummies = pd.get_dummies(
            df[col], prefix_sep="_", drop_first=True, prefix=col)
        #df.drop(columns=col, inplace=True)
        d = pd.concat([d, dummies], axis=1)

    return d


temp = pd.concat([fe_categorical_var(df, col_category),
                  df["SALARY"]], axis=1)
In [39]:
plt.figure(figsize=(10, 8))
sns.heatmap(temp.corr(), vmin=0, vmax=1, cmap='Reds')
plt.title('Correlation Among Variables', fontsize=16)
Out[39]:
Text(0.5, 1, 'Correlation Among Variables')
In [40]:
print(abs(temp.corr()["SALARY"]).sort_values(ascending=False))
SALARY                         1.000000
TEAM_Cleveland Cavaliers       0.225236
TEAM_Brooklyn Nets             0.194550
TEAM_Houston Rockets           0.161096
TEAM_Los Angeles Clippers      0.139010
TEAM_Oklahoma City Thunder     0.126884
TEAM_San Antonio Spurs         0.119863
TEAM_Chicago Bulls             0.118384
TEAM_Golden State Warriors     0.110301
TEAM_New York Knicks           0.107711
TEAM_Boston Celtics            0.107521
TEAM_Miami Heat                0.107052
TEAM_Minnesota Timberwolves    0.098854
TEAM_Phoenix Suns              0.096951
TEAM_Detroit Pistons           0.080391
TEAM_Sacramento Kings          0.079023
TEAM_Utah Jazz                 0.074153
TEAM_Indiana Pacers            0.059490
POSITION_PF                    0.057057
POSITION_PG                    0.052200
TEAM_Orlando Magic             0.049814
TEAM_New Orleans Pelicans      0.048921
TEAM_Dallas Mavericks          0.047797
TEAM_Memphis Grizzlies         0.042341
TEAM_Denver Nuggets            0.039313
TEAM_Milwaukee Bucks           0.030507
POSITION_SF                    0.029899
TEAM_Los Angeles Lakers        0.027655
TEAM_Portland Trail Blazers    0.024585
TEAM_Washington Wizards        0.015189
TEAM_Toronto Raptors           0.013729
TEAM_Philadelphia 76ers        0.011805
POSITION_SG                    0.011310
TEAM_Charlotte Hornets         0.000960
Name: SALARY, dtype: float64

Feature engineering

In [41]:
def fe_target_var(df):
#     return pd.DataFrame(np.log(df["SALARY"]))
#     return pd.DataFrame(df["SALARY"])
    return pd.DataFrame(df["SALARY"]**(1/3))

def fe_numerical_var(df):

    df_number = normalized(df[col_number])
    for col in skew_high_cols:
        df_eda[col] = df_eda[col]** (1/3)
    
    for col in skew_medium_cols:
        df_number[col] = np.sqrt(df_number[col])

    for col in skew_low_log:
        df_number[col] = df_number[col] ** 2

    df_number.drop(columns=["Games","TRB", "ORB",
                            "DWS", "OWS"], axis=1, inplace=True)

    return df_number




def fe_categorical_var(df, cols):
    d = pd.DataFrame()
    for col in cols:
        dummies = pd.get_dummies(
            df[col], prefix_sep="_", drop_first=True, prefix=col)
        #df.drop(columns=col, inplace=True)
        d = pd.concat([d, dummies], axis=1)

    return d


def feature_eng(df):
    return pd.concat(
        [fe_target_var(df), fe_numerical_var(df), fe_categorical_var(df, col_category)], axis=1)
In [42]:
df_engineered = feature_eng(df)
df_engineered
Out[42]:
SALARY Age Minutes PER TS DRB AST STL BLK TOV ... TEAM_Oklahoma City Thunder TEAM_Orlando Magic TEAM_Philadelphia 76ers TEAM_Phoenix Suns TEAM_Portland Trail Blazers TEAM_Sacramento Kings TEAM_San Antonio Spurs TEAM_Toronto Raptors TEAM_Utah Jazz TEAM_Washington Wizards
0 1.355311 0.166667 0.650830 0.783493 0.743467 0.689132 0.340755 0.552771 0.573770 0.230022 ... 0 0 0 0 0 0 0 0 0 0
1 1.345036 0.277778 0.423716 0.567258 0.519656 0.416828 0.795425 0.645497 0.016393 0.612912 ... 0 0 0 0 0 0 0 0 1 0
2 2.314869 0.388889 0.706450 0.780327 0.675053 0.297998 0.924906 0.745356 0.049180 0.716399 ... 0 0 0 0 0 0 0 0 0 0
3 1.144714 0.111111 0.037080 0.281439 0.000000 0.206085 0.658521 0.833333 0.032787 0.851392 ... 0 0 0 0 0 0 0 0 0 0
4 1.418619 0.777778 0.100425 0.433727 0.569995 0.845154 0.137686 0.440959 0.327869 0.539449 ... 0 0 0 0 0 0 0 0 0 0
5 1.077918 0.166667 0.000772 0.000000 0.243047 0.460820 0.228326 0.500000 0.032787 0.341178 ... 0 0 0 0 0 0 0 0 0 0
6 1.846915 0.611111 0.718038 0.487467 0.468411 0.397871 0.698679 0.552771 0.131148 0.634126 ... 0 0 0 0 0 0 0 0 0 0
7 2.494068 0.333333 0.642719 0.637135 0.629781 0.240655 0.586176 0.687184 0.081967 0.544331 ... 0 0 0 0 0 0 0 0 0 0
8 2.289428 0.444444 0.953650 0.814473 0.714527 0.452364 0.878927 0.745356 0.065574 0.572750 ... 0 0 0 0 0 0 0 1 0 0
9 1.526051 0.166667 0.682117 0.512227 0.671920 0.443747 0.307875 0.600925 0.163934 0.384900 ... 0 0 0 0 0 0 0 1 0 0
10 1.843978 0.388889 0.835458 0.629317 0.808708 0.164399 0.389434 0.726483 0.016393 0.377964 ... 0 0 0 0 0 0 0 0 0 0
11 1.643593 0.500000 0.379683 0.754524 0.668773 0.829011 0.380197 0.527046 0.557377 0.442456 ... 0 0 0 0 0 0 0 0 0 1
12 2.351335 0.666667 0.450753 0.700071 0.595341 0.654654 0.580080 0.726483 0.409836 0.604218 ... 0 0 0 0 0 0 0 0 0 1
13 1.791158 0.722222 0.778293 0.580201 0.934568 0.489267 0.515173 0.600925 0.131148 0.581914 ... 0 0 0 0 0 0 0 0 0 0
14 2.512046 0.222222 0.999228 0.796030 0.573685 0.460820 0.953914 0.816497 0.180328 0.693889 ... 0 0 0 0 0 0 0 0 0 1
15 1.293689 0.166667 0.388953 0.648685 0.649570 0.697486 0.510552 0.552771 0.245902 0.682355 ... 0 0 0 0 0 0 0 0 0 0
16 1.058446 0.388889 0.144844 0.344691 0.290496 0.680676 0.194717 0.500000 0.459016 0.391713 ... 0 0 0 0 0 0 0 0 0 0
17 1.618062 0.222222 0.982619 0.767534 0.717473 0.340338 0.738257 0.500000 0.081967 0.476983 ... 0 0 0 0 1 0 0 0 0 0
18 2.136786 0.555556 0.339127 0.696526 0.729140 0.743050 0.435400 0.527046 0.295082 0.646521 ... 0 0 0 0 0 0 0 0 0 0
19 1.335247 0.277778 0.301661 0.398015 0.425951 0.716599 0.182141 0.408248 0.245902 0.404995 ... 0 1 0 0 0 0 0 0 0 0
20 2.817214 0.500000 0.773658 0.850160 0.806095 0.915335 0.413057 0.527046 0.655738 0.738223 ... 0 0 0 0 0 0 0 0 0 0
21 1.957434 0.666667 0.610660 0.605257 0.746299 0.541698 0.580080 0.500000 0.196721 0.693889 ... 0 0 0 0 0 0 1 0 0 0
22 1.734604 0.500000 0.547316 0.678525 0.705613 0.270849 0.404360 0.527046 0.081967 0.272166 ... 0 0 0 0 0 0 0 0 0 0
23 1.587401 0.166667 0.551178 0.487467 0.573685 0.608816 0.248216 0.552771 0.147541 0.465759 ... 0 0 0 0 0 0 0 0 0 0
24 1.866256 0.444444 0.037852 0.385376 0.297670 0.713900 0.429923 0.000000 0.114754 0.731021 ... 0 0 0 0 0 0 0 0 0 0
25 1.952966 0.777778 0.575898 0.789786 0.566282 0.872134 0.627188 0.372678 0.573770 0.524531 ... 0 0 0 0 0 0 0 0 0 0
26 1.650964 0.777778 0.846273 0.540444 0.659241 0.512395 0.491636 0.552771 0.229508 0.465759 ... 0 0 0 0 0 0 0 0 0 0
27 1.493802 0.333333 0.420626 0.652490 0.690502 0.651698 0.401419 0.471405 0.065574 0.404995 ... 0 0 0 0 0 0 0 0 0 0
28 2.498250 0.444444 0.778679 0.580201 0.472894 0.582897 0.311699 0.333333 0.934426 0.621485 ... 0 0 0 0 0 0 0 0 0 0
29 1.289616 0.166667 0.452298 0.540444 0.450035 0.362318 1.000000 0.600925 0.032787 0.986684 ... 0 0 1 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
90 1.411805 0.944444 0.446891 0.811428 0.779484 0.473221 0.798399 0.763763 0.131148 0.697691 ... 0 0 0 0 0 0 1 0 0 0
91 1.131640 0.777778 0.396292 0.521802 0.527713 0.639739 0.421575 0.645497 0.163934 0.348845 ... 0 0 0 0 0 1 0 0 0 0
92 1.096961 0.388889 0.242178 0.306691 0.435745 0.504803 0.337260 0.707107 0.196721 0.612912 ... 0 0 0 0 0 0 0 0 0 0
93 1.553616 0.222222 0.307068 0.696526 0.580993 0.856499 0.456652 0.645497 0.950820 0.686221 ... 0 0 0 0 0 0 0 0 0 0
94 1.418619 0.777778 0.665508 0.450522 0.450035 0.297998 0.696981 0.707107 0.163934 0.646521 ... 0 0 0 0 0 0 0 0 0 0
95 1.338687 0.166667 0.780610 0.660033 0.378761 0.602440 0.816013 0.816497 0.229508 0.716399 ... 0 0 0 0 0 0 0 0 0 0
96 2.700274 0.333333 0.928544 1.000000 0.782186 0.948480 0.676274 0.471405 0.163934 0.404995 ... 0 0 0 0 0 0 0 0 0 0
97 2.314782 0.444444 0.825801 0.757798 0.629781 0.576235 0.555029 0.687184 0.295082 0.608581 ... 0 0 0 0 0 1 0 0 0 0
98 2.381102 0.555556 0.834299 0.889988 0.602386 0.923733 0.503542 0.577350 0.393443 0.145479 ... 0 0 0 0 0 0 0 0 0 0
99 1.005114 0.277778 0.740054 0.455983 0.595341 0.460820 0.429923 0.623610 0.163934 0.514344 ... 0 0 0 0 0 1 0 0 0 0
100 1.650964 0.333333 0.750869 0.502469 0.734904 0.340338 0.248216 0.552771 0.213115 0.411476 ... 1 0 0 0 0 0 0 0 0 0
101 1.709976 0.388889 0.134415 0.575919 0.743467 0.711190 0.367520 0.577350 0.098361 0.662687 ... 0 0 0 0 0 0 0 0 0 0
102 1.777234 0.444444 0.681344 0.487467 0.782186 0.388045 0.330159 0.471405 0.098361 0.241249 ... 0 0 0 0 0 0 0 0 0 1
103 1.709976 0.888889 0.387408 0.764303 1.000000 0.773599 0.000000 0.527046 1.000000 0.563436 ... 0 0 0 0 0 0 0 0 0 0
104 1.267893 0.166667 0.143299 0.410264 0.633122 0.477283 0.257586 0.440959 0.327869 0.348845 ... 0 0 0 0 0 0 0 0 0 0
105 2.248693 0.333333 0.947470 0.928109 0.831855 0.425990 0.946433 0.745356 0.065574 0.686221 ... 0 0 0 0 0 0 0 0 0 0
106 1.426043 0.777778 0.388567 0.575919 0.498944 0.814919 0.443489 0.471405 0.147541 0.634126 ... 0 0 0 0 0 0 0 1 0 0
107 1.310371 0.555556 0.449594 0.535844 0.551178 0.138943 0.708780 0.623610 0.098361 0.674556 ... 0 0 0 0 0 0 0 0 0 0
108 1.783701 0.500000 0.610274 0.592862 0.734904 0.356950 0.413057 0.645497 0.196721 0.377964 ... 0 0 0 0 0 0 0 0 0 0
109 2.289428 0.222222 0.857860 0.707107 0.472894 0.456612 0.808720 0.645497 0.163934 0.482498 ... 0 0 0 0 0 0 0 0 0 0
110 2.018935 0.444444 0.855929 0.545004 0.662434 0.000000 0.370730 0.866025 0.147541 0.411476 ... 0 0 0 0 0 0 0 0 0 0
111 2.541055 0.111111 0.812283 0.814473 0.605878 0.356950 0.836093 0.745356 0.131148 0.509175 ... 0 0 0 0 0 0 0 0 0 0
112 1.727861 0.722222 0.341831 0.663773 0.598874 0.473221 0.476957 1.000000 0.196721 0.690066 ... 0 0 0 0 0 0 0 0 0 0
113 1.011533 0.277778 0.096176 0.734577 0.711568 0.790875 0.271035 0.600925 0.278689 0.325300 ... 0 0 0 1 0 0 0 0 0 0
114 1.631786 0.666667 0.689842 0.597022 0.539573 0.407459 0.884302 0.527046 0.032787 0.731021 ... 0 0 0 0 0 0 0 0 0 0
115 1.769840 0.388889 0.759753 0.592862 0.656033 0.541698 0.684978 0.552771 0.245902 0.509175 ... 0 0 0 0 0 0 0 0 0 0
116 1.466920 0.055556 0.692932 0.222497 0.405656 0.356950 0.296104 0.471405 0.114754 0.476983 ... 0 0 0 0 0 1 0 0 0 0
117 2.000000 0.277778 0.679799 0.761057 0.705613 0.674980 0.456652 0.623610 0.295082 0.549170 ... 0 0 0 0 0 0 0 0 0 1
118 2.015958 0.611111 0.741213 0.567258 0.678171 0.633675 0.319211 0.552771 0.344262 0.370899 ... 0 1 0 0 0 0 0 0 0 0
119 2.205947 0.277778 0.631518 0.761057 0.494698 0.595997 0.818912 0.745356 0.147541 0.621485 ... 0 0 0 0 0 0 0 0 0 0

120 rows × 47 columns

In [43]:
# selected_features = []

Modelling

In [44]:
X_train = df_engineered.iloc[:,1:]
y_train = df_engineered["SALARY"]
In [45]:
# df_engineered.columns
In [46]:
df_test = feature_eng(pd.read_csv("NBA_Test.csv"))
df_test['TEAM_Houston Rockets'] = 0
df_test = df_test[df_engineered.columns]

X_test = df_test.iloc[:,1:]
y_test = df_test["SALARY"]

Lasso

In [47]:
alpha = list(np.logspace(-4, -.5, 30))

lasso = LassoCV(cv=5, random_state=0, alphas=alpha)

lasso.fit(X_train, y_train)
Out[47]:
LassoCV(alphas=[0.0001, 0.00013203517797162948, 0.00017433288221999874,
                0.00023018073130224678, 0.0003039195382313198,
                0.0004012807031942776, 0.0005298316906283707,
                0.0006995642156712634, 0.0009236708571873865,
                0.0012195704601594415, 0.0016102620275609393,
                0.0021261123338996556, 0.0028072162039411755,
                0.0037065129109221566, 0.004893900918477494,
                0.006461670787466...
                0.02592943797404667, 0.03423597957607583, 0.04520353656360245,
                0.05968456995122311, 0.07880462815669913, 0.10404983103657853,
                0.1373823795883264, 0.1813930693911063, 0.2395026619987486,
                0.31622776601683794],
        copy_X=True, cv=5, eps=0.001, fit_intercept=True, max_iter=1000,
        n_alphas=100, n_jobs=None, normalize=False, positive=False,
        precompute='auto', random_state=0, selection='cyclic', tol=0.0001,
        verbose=False)
In [48]:
lasso.alpha_
Out[48]:
0.011264816923358867
In [49]:
lasso.coef_
Out[49]:
array([ 0.        ,  0.65824854,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.47699075,
        0.        , -0.14983528,  0.27756787, -0.        , -0.05618891,
       -0.        , -0.        , -0.        , -0.00300209, -0.        ,
       -0.03431591,  0.        ,  0.        , -0.        , -0.        ,
        0.        ,  0.        , -0.        , -0.        ,  0.        ,
        0.        ,  0.        , -0.        ,  0.        ,  0.        ,
        0.        , -0.        ,  0.        ,  0.        , -0.        ,
       -0.        , -0.        ,  0.        , -0.        , -0.        ,
        0.        ])
In [50]:
y_pred = lasso.predict(X_test)
In [51]:
# np.sqrt(mean_squared_error(np.exp(y_test), np.exp(y_pred)))
# np.sqrt(mean_squared_error(y_test, y_pred))
np.sqrt(mean_squared_error(y_test ** 3, y_pred**3))
Out[51]:
4.357051470967911
In [52]:
def plot_resid(y_test, y_pred):
    """

    """

    resid = y_test-y_pred

    plt.figure(figsize=(16, 9))
    plt.scatter(y_pred, resid)
    plt.axhline(0, 0, 1, color="g", ls="--")
    plt.xlabel("Fitted")
    plt.ylabel("Residual")
    plt.title("Residual Plot")
    plt.savefig("Residual plot ols")
    plt.show()


plot_resid(y_test, y_pred)

Gradient Boosting

In [53]:
tuning_parameters = {
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [250, 500, 750, 1000, 1500],
    'max_depth': [2, 3, 4],
    'subsample': [0.6, 0.8, 1.0]
}

# Using GridSearchCV would be too slow. Increase the number of iterations to explore more hyperparameter combinations.
gb = RandomizedSearchCV(GradientBoostingRegressor(
), tuning_parameters, n_iter=1, cv=10, return_train_score=False, n_jobs=4)
gb.fit(X_train, y_train)

y_pred = gb.predict(X_test)

print('Best parameters found by randomised search:', gb.best_params_, '\n')
Best parameters found by randomised search: {'subsample': 1.0, 'n_estimators': 1000, 'max_depth': 2, 'learning_rate': 0.05} 

In [54]:
# np.sqrt(mean_squared_error(np.exp(y_test), np.exp(y_pred)))
# np.sqrt(mean_squared_error(y_test, y_pred))
np.sqrt(mean_squared_error(y_test ** 3, y_pred**3))
Out[54]:
4.284887197188837
In [55]:
plot_feature_importance(gb.best_estimator_, list(X_train))
plt.show()

RandomForestRegressor

In [56]:
tuning_parameters = {
    'min_samples_leaf': [1, 5, 10, 20, 50],
    'max_features': np.arange(1, X_train.shape[1], 5),
}

rf_search = RandomizedSearchCV(RandomForestRegressor(), tuning_parameters, cv=10, n_iter=20, return_train_score=True, n_jobs=4,
                               random_state=20)
rf_search.fit(X_train, y_train)

rf = rf_search.best_estimator_

print('Best parameters found by randomised search:', rf_search.best_params_, '\n')

y_pred = rf.predict(X_test)

np.sqrt(mean_squared_error(y_test ** 3, y_pred**3))
Best parameters found by randomised search: {'min_samples_leaf': 10, 'max_features': 16} 

Out[56]:
4.2967735428443365
In [57]:
plot_feature_importance(rf_search.best_estimator_, list(X_train))
plt.show()

Xgboost

In [58]:
xgb_model = xgb.XGBRegressor(objective='reg:squarederror')
reg_xgb = GridSearchCV(xgb_model,
                       {'max_depth': [2, 4, 6],
                        'n_estimators': [50, 100, 200]}, verbose=1)
reg_xgb.fit(X_train, y_train)

print(reg_xgb.best_score_)

print('Best parameters found by randomised search:', reg_xgb.best_params_, '\n')

y_pred = reg_xgb.predict(X_test)

np.sqrt(mean_squared_error(y_test ** 3, y_pred**3))
Fitting 3 folds for each of 9 candidates, totalling 27 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
0.3320053305424023
Best parameters found by randomised search: {'max_depth': 2, 'n_estimators': 50} 

[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:    1.7s finished
Out[58]:
4.036398435677304
In [59]:
plot_feature_importance(reg_xgb.best_estimator_, list(X_train))
plt.show()
In [60]:
toc = time.time()
print("Training time: {0:.4f}s".format(toc - tic))
Training time: 51.2329s